"""Scrape chapter links from a tutorial table-of-contents page.

Fetches the page with a randomized User-Agent, isolates the
table-of-contents block with a regex, then extracts and prints
(href, title) pairs from its anchor tags.
"""
import re

import requests
from fake_useragent import UserAgent

url = 'http://c.biancheng.net/python_spider/re-module.html'
# Random User-Agent reduces the chance of the request being blocked as a bot.
headers = {
    "User-Agent": UserAgent().random
}
# Fetch the page.  A timeout keeps the script from hanging forever on a
# dead server, and raise_for_status() surfaces HTTP errors early instead
# of silently parsing an error page.
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
response.encoding = 'utf-8'
info = response.text
# print(info)

# Isolate the contents <div>.  NOTE(review): greedy `.*` with re.S spans
# from the FIRST "contents" div to the LAST "article-wrap" div — confirm
# this is the intended region of the page (kept as-is to preserve behavior).
pattern = re.compile(r'<div id="contents">.*<div id="article-wrap">', re.S)
matches = pattern.findall(info)
if not matches:
    # The original indexed [0] unconditionally and would crash with a bare
    # IndexError if the page layout changed; fail with a clear message instead.
    raise SystemExit("table-of-contents block not found in page")
r_content = matches[0]

# Example anchor: <a href="/python_spider/what-is-spider.html">网络爬虫是什么</a>
pattern = re.compile(r'<a href="(?P<href>.*?)">(?P<title>.*?)</a>', re.S)
# finditer() returns an iterator object, which is always truthy — the
# original `if r_list:` guard was a no-op, so iterate directly.
for r_info in pattern.finditer(r_content):
    if ".html" not in r_info.group('href'):
        continue  # skip anchors that are not chapter pages
    print("href:", r_info.group('href'))
    print("title:", r_info.group('title'))
    print(20 * "*")
# Expected output (sample):
# href: /python_spider/what-is-spider.html
# title: 网络爬虫是什么
# ********************
# href: /python_spider/webpage.html
# title: 网页构成
# ********************